# Load required packages. library() stops with an error when a package is
# missing, whereas require() only warns and returns FALSE, which would let the
# script run on and fail later with a less obvious message.
library(tidyverse)  # Data manipulation (dplyr) and plotting (ggplot2)
# NOTE(review): no sf function is called in the code below - confirm that the
# package is still needed here.
library(sf)

# Load the processed EPA employment data. load() restores objects under their
# saved names and returns those names, so verify that the data frame used by
# the rest of the script was actually stored in the file.
loaded_objects <- load("00_data/01_data_processed/epa_empl.Rdata")
if (!"EPA_EMPL_COUNTY" %in% loaded_objects) {
  stop(
    "EPA_EMPL_COUNTY not found in 00_data/01_data_processed/epa_empl.Rdata",
    call. = FALSE
  )
}

# Upper-case full names of the states flagged as "Phase 1"; rows whose State
# appears in this vector get treat = TRUE further down.
PHASE_1_STATES <- c(
  "INDIANA", "IOWA", "KANSAS",
  "KENTUCKY", "MARYLAND", "MICHIGAN",
  "MINNESOTA", "MISSISSIPPI", "MISSOURI",
  "NEW HAMPSHIRE", "NEW JERSEY", "NEW YORK",
  "ALABAMA", "OHIO", "PENNSYLVANIA",
  "TENNESSEE", "WEST VIRGINIA", "WISCONSIN",
  "GEORGIA", "FLORIDA", "ILLINOIS"
)

# Two-digit codes of the EPA regions treated as "treatment" regions:
# "01" to "05" plus "07" (zero-padded to match the names of state_to_region).
EPA_TREAT <- sprintf("%02d", c(1:5, 7L))

# Lookup table: two-digit EPA region code -> upper-case names of the states
# and territories assigned to that region.
state_to_region <- list(
  "01" = c(
    "CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND",
    "VERMONT"
  ),
  "02" = c("NEW JERSEY", "NEW YORK", "PUERTO RICO", "U.S. VIRGIN ISLANDS"),
  "03" = c(
    "DELAWARE", "DISTRICT OF COLUMBIA", "MARYLAND", "PENNSYLVANIA",
    "VIRGINIA", "WEST VIRGINIA"
  ),
  "04" = c(
    "ALABAMA", "FLORIDA", "GEORGIA", "KENTUCKY", "MISSISSIPPI",
    "NORTH CAROLINA", "SOUTH CAROLINA", "TENNESSEE"
  ),
  "05" = c(
    "ILLINOIS", "INDIANA", "MICHIGAN", "MINNESOTA", "OHIO", "WISCONSIN"
  ),
  "06" = c("ARKANSAS", "LOUISIANA", "NEW MEXICO", "OKLAHOMA", "TEXAS"),
  "07" = c("IOWA", "KANSAS", "MISSOURI", "NEBRASKA"),
  "08" = c(
    "COLORADO", "MONTANA", "NORTH DAKOTA", "SOUTH DAKOTA", "UTAH", "WYOMING"
  ),
  "09" = c(
    "ARIZONA", "CALIFORNIA", "HAWAII", "NEVADA", "AMERICAN SAMOA", "GUAM",
    "NORTHERN MARIANA ISLANDS"
  ),
  "10" = c("ALASKA", "IDAHO", "OREGON", "WASHINGTON")
)

# Look up the EPA region code for one or more state names.
#
# Args:
#   state_name: character vector (or factor) of state names, spelled exactly
#     as in `state_to_region` (upper case, full name).
#
# Returns:
#   A character vector the same length as `state_name` holding the region code
#   ("01" ... "10"), with NA_character_ for names that are not in the lookup.
#   If a name were listed under several regions, the first region wins.
#
# The lookup is vectorised with match() instead of looping over the regions,
# so the function accepts whole columns and always returns character, never a
# logical NA.
find_region <- function(state_name) {
  # Flatten the list into two parallel vectors: state name and its region code.
  region_codes <- rep(names(state_to_region), lengths(state_to_region))
  state_names <- unlist(state_to_region, use.names = FALSE)
  region_codes[match(state_name, state_names)]
}

# Collapse the loaded records to one row per State and year:
#   number       - number of input rows in the cell (presumably one row per
#                  employee record - confirm against the data preparation)
#   median_grade / mean_grade - median and mean of grade_numeric
#   median_edu   / mean_edu   - median and mean of education_numeric
# Missing values are ignored in the medians and means.
# NOTE(review): the result is state-level but overwrites EPA_EMPL_COUNTY,
# because the rest of the script refers to it under that name.
EPA_EMPL_COUNTY <- EPA_EMPL_COUNTY %>%
  group_by(State, year) %>%
  summarize(
    number = n(),
    median_grade = median(grade_numeric, na.rm = TRUE),
    median_edu = median(education_numeric, na.rm = TRUE),
    mean_grade = mean(grade_numeric, na.rm = TRUE),
    mean_edu = mean(education_numeric, na.rm = TRUE),
    # Return an ungrouped tibble instead of relying on the default, which
    # prints a message and leaves the result grouped by State.
    .groups = "drop"
  )

# Attach the EPA region and the treatment indicators to every State-year row.
#
# vapply() pins the result to an unnamed character vector of the right length;
# sapply() would return a list for zero rows and a logical vector when no state
# matches. as.character() around find_region() turns a logical NA into
# NA_character_ so unmatched states stay NA.
EPA_EMPL_COUNTY$epa_region <- vapply(
  as.character(EPA_EMPL_COUNTY$State),
  function(state) as.character(find_region(state)),
  character(1),
  USE.NAMES = FALSE
)
# State-level flag: TRUE when the state is listed in PHASE_1_STATES.
EPA_EMPL_COUNTY$treat <- EPA_EMPL_COUNTY$State %in% PHASE_1_STATES
# Region-level flag: TRUE when the state's EPA region is listed in EPA_TREAT.
# States without a region (NA) get FALSE because %in% never returns NA.
EPA_EMPL_COUNTY$treat_region <- EPA_EMPL_COUNTY$epa_region %in% EPA_TREAT
# Years relative to 1995 (negative before, positive after).
# NOTE(review): presumably 1995 is the first treatment year - confirm.
EPA_EMPL_COUNTY$time_to_treat <- EPA_EMPL_COUNTY$year - 1995

# Restrict the sample: drop the District of Columbia and keep the years
# 1989 through 2001 (both inclusive).
treat_year <- filter(
  EPA_EMPL_COUNTY,
  State != "DISTRICT OF COLUMBIA",
  year >= 1989,
  year <= 2001
)

# Total employee count per EPA region and year. treat_region and time_to_treat
# are functions of epa_region and year respectively, so listing them in
# group_by() only carries them through; it does not create extra cells.
#
# .groups = "drop_last" spells out what summarize() would do by default here
# (and silences its message): the result stays grouped by epa_region, year and
# treat_region. The plotting pipeline below builds a factor inside mutate()
# on this grouped data, so the grouping is deliberately left unchanged.
treat_year_epa <- treat_year %>%
  group_by(epa_region, year, treat_region, time_to_treat) %>%
  summarize(number = sum(number), .groups = "drop_last")

# Plot the mean year-over-year percentage change in the number of EPA
# employees, averaged over EPA regions, separately for treatment and
# non-treatment regions.
treat_year_epa %>%
  # NOTE(review): if treat_year_epa still carries its grouping from the
  # summarize() that created it, factor() is evaluated once per group and the
  # combined level order follows first appearance across groups, not
  # c("0", "1"). The manual colour scales below assign colours by level
  # order - confirm the colours land on the intended groups.
  mutate(Treatment = factor(if_else(treat_region == TRUE, 1, 0))) %>%
  group_by(epa_region) %>%
  arrange(year) %>%
  # lag() runs within each region on rows sorted by year, so pct_change is the
  # change relative to that region's previous available year.
  mutate(pct_change = (number / lag(number) - 1) * 100) %>%
  # The first year of every region has no predecessor and is dropped.
  filter(!is.na(pct_change)) %>%
  group_by(Treatment, year) %>%
  summarize(mean_pct_change = mean(pct_change), .groups = "drop") %>%
  ggplot(
    aes(x = year, y = mean_pct_change, color = Treatment, fill = Treatment)
  ) +
  geom_point() +
  geom_line() +
  theme_bw() +
  scale_x_continuous(breaks = 1990:2001) +
  ylab("Pct. change of EPA employees") +
  xlab("Year") +
  # Solid zero line plus dashed vertical reference lines at 1994 and 1999.
  geom_hline(yintercept = 0, linetype = 1) +
  geom_vline(xintercept = 1994, linetype = 2) +
  geom_vline(xintercept = 1999, linetype = 2) +
  scale_color_manual(values = rev(c("#0A2463", "#FB3640"))) +
  scale_fill_manual(values = rev(c("#0A2463", "#FB3640"))) +
  theme(
    legend.position = c(0.9, 0.2),
    legend.background = element_rect(
      fill = "white",
      linewidth = 0.2,
      linetype = "solid",
      colour = "black"
    )
  ) +
  # The y-range is set only here. coord_cartesian() zooms without discarding
  # data; an additional ylim(-10, 20) would set scale limits and turn every
  # value outside them into NA before drawing, leaving gaps in a panel that
  # is displayed down to -20.
  coord_cartesian(ylim = c(-20, 20))

# Write the most recent plot to disk as an 8 x 3 inch figure, once as PDF and
# once as a 600 dpi JPEG.
ggsave(filename = "03_figures/figure2.pdf", width = 8, height = 3)
ggsave(filename = "03_figures/figure2.jpg", width = 8, height = 3, dpi = 600)

# Remove every (non-hidden) object from the environment this line is evaluated
# in, including the loaded data and all intermediate results created above.
# NOTE(review): when this file is source()d from another script with the
# default settings, this also deletes the caller's objects in the global
# environment - confirm that this is intended.
rm(list = ls())
